iscream

Make fast and efficient BED file queries

James Eapen

June 25, 2025

iscream overview

  • All in R
  • Supported by C++, htslib, Armadillo
  • All operations a single line of code
  • Zero-copy modifications by reference where possible
  • Multithreaded, respects environment limits

Make tabix queries

# Collect every bgzipped BED file in the working directory.
# list.files() takes a regular expression via `pattern`, not a shell glob
# as its first (path) argument.
files <- list.files(pattern = "\\.bed\\.gz$")

# Regions of interest: a tab-separated, three-column BED file.
regions <- read.table(
  "regions.bed",
  sep = "\t",
  col.names = c("chr", "start", "end")
)

# Query all files over the regions; the data columns are named A, B and C.
# The names must be passed as one character vector.
query <- tabix(files, regions, col.names = c("A", "B", "C"))
            chr     start       end     A     B     C    file
         <char>     <int>     <int> <num> <int> <int>  <char>
      1:      1   4785488   4785488     0     0     2 cell_01
      2:      1   4785513   4785513     0     0     2 cell_01
      3:      1   4785522   4785522     0     0     2 cell_01
      4:      1   4785533   4785533     0     0     2 cell_01
      5:      1   4786780   4786780   100     1     0 cell_01
     ---                                                     
2201516:      X 168673020 168673020   100     1     0 cell_30
2201517:      X 168673032 168673032     0     0     1 cell_30
2201518:      X 168673164 168673164     0     0     1 cell_30
2201519:      X 168674367 168674367   100     1     0 cell_30
2201520:      X 168675047 168675047   100     1     0 cell_30

Make matrices from data columns

# Matrix built from data column 4 of every file over the regions.
mat <- make_mat(files, regions, column = 4)

# Same query, requesting a sparse matrix (printed below as a dgCMatrix).
mat <- make_mat(files, regions, column = 4, sparse = TRUE)
> head(mat$value)
6 x 30 sparse Matrix of class "dgCMatrix"
  [[ suppressing 30 column names ‘cell_01’, ‘cell_02’, ‘cell_03’ ... ]]
                                                                
[1,] . . . . . . . . . . . . . . . . 2 . . . 1 . 1 . 1 . . 1 . 1
[2,] . . . . . . . . . . . 2 . . . . 1 . 1 . 1 . 1 . 1 . . 1 . 2
[3,] . . . . . . . . . . . . . . . . . . . . 1 . 2 . 1 . . . . .
[4,] . . . . . . . . . . . 1 . . . . . . . . 1 . 1 . 1 . . . . .
[5,] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
[6,] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

Summarize data columns

sum, mean, median, variance, SD, min, max, range, count

# Summarize data column 4 using the default set of summary functions.
summary <- summarize_regions(files, regions, column = 4)
# Only the mean, for data columns 4 through 6.
means <- summarize_regions(files, regions, column = 4:6, fun = "mean")

# Minimum and maximum of columns 4 and 6, reported as A and C.
# Multiple columns must be passed as one vector: c(4, 6).
# NOTE(review): `feature` is unquoted, so it must be a variable defined
# elsewhere; quote it if a column name of `regions` is intended - confirm.
min_max <- summarize_regions(
  files,
  regions,
  column = c(4, 6),
  col_names = c("A", "C"),
  feature_col = feature,
  fun = c("min", "max")
)
> head(min_max)
    feature    file   A.min   C.min   A.max   C.max
1 feature_1 cell_01       1       1       2       2
2 feature_2 cell_01       0       0       1       2
3 feature_3 cell_01       0       1       1       2
4 feature_4 cell_01       0       0       1       2
5 feature_5 cell_01       0       0       1       2
6 feature_6 cell_01       0       0       1       2